In [1]:
import tensorflow as tf
import numpy as np
from tensorflow import data
import shutil
from datetime import datetime
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt
from tensorflow.contrib.learn import learn_runner
from tensorflow.contrib.learn import make_export_strategy
print(tf.__version__)
In [2]:
train_data_files = ['data/train-data.csv']
test_data_files = ['data/test-data.csv']
model_name = 'clust-model-02'
resume = False
train = True
preprocess_features = False
extend_feature_columns = False
In [3]:
HEADER = ['key', 'x1', 'x2', 'x3', 'cluster']
HEADER_DEFAULTS = [[0], [0.0], [0.0], [0.0], ['NA']]
FEATURE_NAMES = ['x1', 'x2', 'x3']
UNUSED_FEATURE_NAMES = list(set(HEADER) - set(FEATURE_NAMES))
print("Header: {}".format(HEADER))
print("Input Features: {}".format(FEATURE_NAMES))
print("Unused Features: {}".format(UNUSED_FEATURE_NAMES))
In [4]:
def parse_csv_row(csv_row):
    # Parse a CSV line into a dict of feature tensors, dropping unused columns
    columns = tf.decode_csv(csv_row, record_defaults=HEADER_DEFAULTS)
    columns = [tf.expand_dims(tensor, -1) for tensor in columns]
    features = dict(zip(HEADER, columns))

    for column in UNUSED_FEATURE_NAMES:
        features.pop(column)

    return features

def process_features(features):
    # Placeholder for feature preprocessing; currently a no-op
    if preprocess_features:
        features = features
    return features
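A quick way to sanity-check the parser is to evaluate it on a literal row in a session; the sample row below is made up to match HEADER.

# Sanity check with a made-up row matching HEADER (key, x1, x2, x3, cluster)
sample_row = tf.constant("1,0.5,-1.2,3.4,c1")
parsed_features = parse_csv_row(sample_row)
with tf.Session() as sess:
    print(sess.run(parsed_features))  # only x1, x2, x3 remain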
In [5]:
def csv_input_fn(file_names, mode=tf.estimator.ModeKeys.TRAIN,
                 skip_header_lines=0,
                 num_epochs=None,
                 batch_size=200):

    shuffle = False

    print("")
    print("* data input_fn:")
    print("================")
    print("Input file(s): {}".format(file_names))
    print("Batch size: {}".format(batch_size))
    print("Epoch Count: {}".format(num_epochs))
    print("Mode: {}".format(mode))
    print("Shuffle: {}".format(shuffle))
    print("================")
    print("")

    dataset = data.TextLineDataset(filenames=file_names)
    dataset = dataset.skip(skip_header_lines)
    dataset = dataset.batch(batch_size)
    dataset = dataset.map(lambda csv_row: parse_csv_row(csv_row))
    dataset = dataset.map(lambda features: process_features(features))
    dataset = dataset.repeat(num_epochs)

    iterator = dataset.make_one_shot_iterator()
    features = iterator.get_next()
    # K-Means is unsupervised, so no labels are returned
    return features, None
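Note that this input_fn never shuffles (shuffle is hard-coded to False). If shuffling during training is wanted, one variant, sketched here and not part of the original pipeline, is to shuffle the raw lines before batching:

# Sketch: a shuffling variant of csv_input_fn (buffer size is an arbitrary choice)
def csv_input_fn_shuffled(file_names, mode=tf.estimator.ModeKeys.TRAIN,
                          skip_header_lines=0, num_epochs=None, batch_size=200):
    dataset = data.TextLineDataset(filenames=file_names)
    dataset = dataset.skip(skip_header_lines)
    # Shuffle raw lines only when training
    if mode == tf.estimator.ModeKeys.TRAIN:
        dataset = dataset.shuffle(buffer_size=10 * batch_size)
    dataset = dataset.batch(batch_size)
    dataset = dataset.map(parse_csv_row)
    dataset = dataset.map(process_features)
    dataset = dataset.repeat(num_epochs)
    return dataset.make_one_shot_iterator().get_next(), None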
In [6]:
features, _ = csv_input_fn(file_names=train_data_files)
print("Feature read from CSV: {}".format(list(features.keys())))
In [7]:
def create_estimator(run_config, hparams):
    estimator = tf.contrib.learn.KMeansClustering(
        num_clusters=hparams.num_clusters,
        initial_clusters=tf.contrib.factorization.RANDOM_INIT,
        distance_metric=tf.contrib.factorization.SQUARED_EUCLIDEAN_DISTANCE,
        use_mini_batch=True,
        mini_batch_steps_per_iteration=1,
        kmeans_plus_plus_num_retries=10,
        relative_tolerance=None,
        config=run_config
    )

    print("")
    print("Estimator Type: {}".format(type(estimator)))
    print("")

    return estimator
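For intuition, each mini-batch step of the estimator performs roughly the classic mini-batch K-Means update: assign each point to the nearest centroid under squared Euclidean distance, then pull each centroid toward the mean of its assigned points. A minimal NumPy sketch on synthetic data (not the estimator's actual implementation):

# Minimal mini-batch K-Means step in NumPy (illustration only)
rng = np.random.RandomState(0)
points = rng.randn(32, 3)    # a mini-batch of 32 points in R^3
centroids = rng.randn(3, 3)  # 3 centroids, matching num_clusters

# Assignment step: squared Euclidean distance to each centroid
d2 = ((points[:, None, :] - centroids[None, :, :]) ** 2).sum(axis=2)
assign = d2.argmin(axis=1)

# Update step: move each centroid to the mean of its assigned points
for k in range(centroids.shape[0]):
    members = points[assign == k]
    if len(members) > 0:
        centroids[k] = members.mean(axis=0)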
In [8]:
def csv_serving_input_fn():
    SERVING_HEADER = ['x1', 'x2', 'x3']
    SERVING_HEADER_DEFAULTS = [[0.0], [0.0], [0.0]]

    rows_string_tensor = tf.placeholder(dtype=tf.string,
                                        shape=[None],
                                        name='csv_rows')
    receiver_tensor = {'csv_rows': rows_string_tensor}

    row_columns = tf.expand_dims(rows_string_tensor, -1)
    columns = tf.decode_csv(row_columns, record_defaults=SERVING_HEADER_DEFAULTS)
    columns = [tf.expand_dims(tensor, -1) for tensor in columns]
    features = dict(zip(SERVING_HEADER, columns))

    return tf.contrib.learn.InputFnOps(
        process_features(features),
        None,
        receiver_tensor
    )
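The serving graph can be sanity-checked directly by feeding made-up CSV rows (x1,x2,x3 only, no key or cluster column) into the csv_rows placeholder:

# Sketch: feed two made-up rows through the serving input pipeline
serving_ops = csv_serving_input_fn()
with tf.Session() as sess:
    print(sess.run(serving_ops.features,
                   feed_dict={serving_ops.default_inputs['csv_rows']:
                              ["0.5,-1.2,3.4", "2.1,0.3,-0.7"]}))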
In [9]:
def generate_experiment_fn(**experiment_args):

    def _experiment_fn(run_config, hparams):

        train_input_fn = lambda: csv_input_fn(
            train_data_files,
            mode=tf.contrib.learn.ModeKeys.TRAIN,
            num_epochs=hparams.num_epochs,
            batch_size=hparams.batch_size * 10
        )

        eval_input_fn = lambda: csv_input_fn(
            train_data_files,
            mode=tf.contrib.learn.ModeKeys.EVAL,
            num_epochs=1,
            batch_size=hparams.batch_size
        )

        estimator = create_estimator(run_config, hparams)

        return tf.contrib.learn.Experiment(
            estimator,
            train_input_fn=train_input_fn,
            eval_input_fn=eval_input_fn,
            eval_steps=None,
            **experiment_args
        )

    return _experiment_fn
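If only training is needed (no export strategies), the Experiment machinery can be bypassed by calling fit on the estimator directly; a sketch, left commented out so it does not train a second time:

# Sketch: train without learn_runner/Experiment
# estimator = create_estimator(run_config, hparams)
# estimator.fit(input_fn=lambda: csv_input_fn(
#     train_data_files,
#     num_epochs=hparams.num_epochs,
#     batch_size=hparams.batch_size * 10))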
In [10]:
hparams = tf.contrib.training.HParams(
    num_epochs=1000,
    batch_size=500,
    num_clusters=3
)

model_dir = 'trained_models/{}'.format(model_name)

run_config = tf.contrib.learn.RunConfig(
    save_checkpoints_steps=100,
    tf_random_seed=19850610,
    model_dir=model_dir
)
print(run_config.model_dir)
In [11]:
if not resume:
    print("Removing previous artifacts...")
    shutil.rmtree(model_dir, ignore_errors=True)
else:
    print("Resuming training...")

if train:
    tf.logging.set_verbosity(tf.logging.INFO)

    time_start = datetime.utcnow()
    print("Experiment started at {}".format(time_start.strftime("%H:%M:%S")))
    print(".......................................")

    learn_runner.run(
        experiment_fn=generate_experiment_fn(
            export_strategies=[make_export_strategy(
                csv_serving_input_fn,
                exports_to_keep=1
            )]
        ),  # the export strategy is not executed under the "train" schedule
        run_config=run_config,
        schedule="train",  # the evaluate function is broken, so train_and_evaluate is not used
        hparams=hparams
    )

    time_end = datetime.utcnow()
    print(".......................................")
    print("Experiment finished at {}".format(time_end.strftime("%H:%M:%S")))
    print("")

    time_elapsed = time_end - time_start
    print("Experiment elapsed time: {} seconds".format(time_elapsed.total_seconds()))
In [12]:
### The evaluate and score functions are broken for this estimator, so they are not called:
# estimator.evaluate(input_fn=test_input_fn)
# estimator.score(input_fn=test_input_fn)

train_input_fn = lambda: csv_input_fn(
    train_data_files,
    num_epochs=1,
    batch_size=1500
)

test_input_fn = lambda: csv_input_fn(
    test_data_files,
    mode=tf.estimator.ModeKeys.EVAL,
    num_epochs=1,
    batch_size=500
)

estimator = create_estimator(run_config, hparams)

train_assignments = list(estimator.predict_cluster_idx(input_fn=train_input_fn))
test_assignments = list(estimator.predict_cluster_idx(input_fn=test_input_fn))
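With the assignments in hand, a quick sanity check (not in the original notebook) is to count how many points land in each cluster:

from collections import Counter
print("Train cluster sizes: {}".format(Counter(train_assignments)))
print("Test cluster sizes: {}".format(Counter(test_assignments)))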
In [13]:
import pandas as pd

train_df = pd.read_csv(train_data_files[0], header=None, index_col=0)
test_df = pd.read_csv(test_data_files[0], header=None, index_col=0)

fig = plt.figure(figsize=(20, 10))

# Colour each point by its predicted cluster index
ax = fig.add_subplot(121, projection='3d')
ax.scatter(train_df.iloc[:, 0], train_df.iloc[:, 1], train_df.iloc[:, 2], c=train_assignments, marker='o')

ax = fig.add_subplot(122, projection='3d')
ax.scatter(test_df.iloc[:, 0], test_df.iloc[:, 1], test_df.iloc[:, 2], c=test_assignments, marker='o')

plt.show()
In [14]:
clusters = estimator.clusters()
print("Cluster Centriods:")
print("==================")
print(clusters)
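One numeric way to judge the fit is the K-Means objective itself: the sum of squared distances from each point to its assigned centroid. A NumPy sketch, assuming train_df's first three columns are x1, x2, x3 in file order:

# Sum of squared distances to assigned centroids (the K-Means objective)
train_points = train_df.iloc[:, 0:3].values
assigned_centroids = clusters[np.array(train_assignments)]
sse = np.sum((train_points - assigned_centroids) ** 2)
print("Train SSE: {}".format(sse))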
In [15]:
export_dir = model_dir + "/export"

estimator.export_savedmodel(
    export_dir_base=export_dir,
    serving_input_fn=csv_serving_input_fn,
    as_text=False
)
In [16]:
%%bash
MODEL_NAME='clust-model-02'
LAST=$(ls trained_models/${MODEL_NAME}/export | tail -1)
SAVE_MODEL_DIR=trained_models/$MODEL_NAME/export/$LAST
ls $SAVE_MODEL_DIR
gcloud ml-engine local predict --model-dir=$SAVE_MODEL_DIR --text-instances='data/new-data.csv'
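As an alternative to gcloud, the exported SavedModel can be loaded back in Python, assuming tf.contrib.predictor is available in this TF 1.x version; a sketch using the export directory from the cells above:

import os
from tensorflow.contrib import predictor

# Load the most recent timestamped export and predict on made-up rows
saved_model_dir = os.path.join(export_dir, sorted(os.listdir(export_dir))[-1])
predict_fn = predictor.from_saved_model(saved_model_dir)
print(predict_fn({'csv_rows': ["0.5,-1.2,3.4", "2.1,0.3,-0.7"]}))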